R Markdown

# Required libraries
library(dplyr)
library(Matrix)
library(Seurat)
library(cowplot)
library(ggplot2)
library(reticulate)
set.seed(666)

# Set up file names and conditions
m_files = c("data/reads.Sample1_MCSF_R1_001.fastq_bq10_star_corrected.umi.dge.txt", "data/mcsf_day6_1.txt", "data/mcsf_day6_2.txt")
m_donors = c("2", "1a", "1a")
g_files = c("data/gmcsf_day6_1.txt", "data/gmcsf_day6_2.txt", "data/reads.Sample2_gMCSF_R1_001.fastq_bq10_star_corrected.umi.dge.txt")
g_donors = c("1a", "1a", "2")
files = c(m_files, g_files)
donornames = c(m_donors, g_donors)
stimnames = c(rep("M", length(m_donors)), rep("G", length(g_donors)))
# Returns Seurat Object with donor information and stimulation condition as metadata
loadData <- function(filename, stimname, donorname) {
    raw_counts <- read.table(file = filename, header = TRUE, row.names = 1, sep = "\t", stringsAsFactors = FALSE)
    raw <- CreateSeuratObject(counts = raw_counts, project = "mcsf-gmcsf", min.cells = 3, min.features = 200)
    raw$stim <- stimname
    raw$donor <- donorname
    Idents(raw) <- "stim"
    return(raw)
}

performQC <- function(raw, plot = FALSE) {
    # Calculate percent mitochondrial reads
    mito.genes <- grep(pattern = "^MT-", x = rownames(x = GetAssayData(object = raw)), value = TRUE)
    raw[["percent.mt"]] <- (colSums(GetAssayData(object = raw, slot = "counts")[mito.genes, ])/colSums(GetAssayData(object = raw, 
        slot = "counts")) * 100)
    
    if (plot) {
        scatter <- FeatureScatter(object = raw, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "percent.mt")
        print(scatter)
        
        # Count QC
        counts = raw$nCount_RNA
        cts <- qplot(counts, geom = "histogram", bins = 100)
        cts_lo <- qplot(counts[counts < 5000], geom = "histogram", bins = 100)
        cts_hi <- qplot(counts[counts > 10000], geom = "histogram", bins = 100)
        cts_grid <- plot_grid(cts, cts_lo, cts_hi)
        print(cts_grid)
        
        # Gene QC
        genes = raw$nFeature_RNA
        gen <- qplot(genes, geom = "histogram", bins = 100)
        gen_lo <- qplot(genes[genes < 1000], geom = "histogram", bins = 100)
        gen_grid <- plot_grid(gen, gen_lo)
        print(gen_grid)
        
        vln <- VlnPlot(object = raw, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), group.by = "stim", ncol = 3)
        print(vln)
    }
    # Filter
    raw <- subset(raw, subset = nCount_RNA > 2500 & nCount_RNA < 75000 & percent.mt < 20)
    raw <- NormalizeData(raw, verbose = FALSE)
    raw <- FindVariableFeatures(raw, selection.method = "vst", nfeatures = 4000)
    return(raw)
}


scaleData <- function(raw) {
    raw <- ScaleData(raw, vars.to.regress = c("nUMI", "percent.mt"), verbose = FALSE)
    raw <- RunPCA(raw, features = VariableFeatures(object = raw), npcs = 100, verbose = FALSE)
    p2 <- DimPlot(raw)
    
    # print(p2)
    
    return(raw)
}
# Load the datasets
alldata <- lapply(1:length(files), function(i) i)
mergeruns <- lapply(1:(length(unique(g_donors)) + length(unique(m_donors))), function(i) i)
ctr = 0
for (i in seq_along(files)) {
    alldata[[i]] <- loadData(files[i], stimnames[i], donornames[i])
    # merge runs with the same donor and stimulation
    if (i == 1 || donornames[i - 1] != donornames[i] || stimnames[i - 1] != stimnames[i]) {
        if (ctr > 0) {
            # cellCycle(mergeruns[[ctr]])
            mergeruns[[ctr]] <- performQC(mergeruns[[ctr]], plot = TRUE)
            mergeruns[[ctr]] <- scaleData(mergeruns[[ctr]])
        }
        ctr = ctr + 1
        mergeruns[[ctr]] <- alldata[[i]]
    } else {
        mergeruns[[ctr]] <- merge(mergeruns[[ctr]], alldata[[i]])
        
    }
}

mergeruns[[length(mergeruns)]] <- performQC(mergeruns[[length(mergeruns)]], plot = TRUE)

scaleData(mergeruns[[length(mergeruns)]])
## An object of class Seurat 
## 16590 features across 1081 samples within 1 assay 
## Active assay: RNA (16590 features)
##  1 dimensional reduction calculated: pca
# Merge all data
merged <- merge(alldata[[1]], y = alldata[2:6])
merged <- performQC(merged, plot = TRUE)

Perform integration

# returns
integrate <- function(list, dims = 50) {
    k.filter = min(200, sapply(list, ncol))
    list.anchors <- FindIntegrationAnchors(object.list = list, dims = 1:dims, k.filter = k.filter)
    list.combined <- IntegrateData(anchorset = list.anchors, dims = 1:dims)
    DefaultAssay(list.combined) <- "integrated"
    return(list.combined)
}

splitdonor <- SplitObject(merged, split.by = "donor")
splitstim <- SplitObject(merged, split.by = "stim")

donor.combined <- integrate(splitdonor)
stim.combined <- integrate(splitstim)
runs.combined <- integrate(mergeruns)
cluster <- function(data, npcs = 30, resolution = 0.5) {
    data <- ScaleData(data)  #, vars.to.regress = c('nUMI', 'percent.mt', 'CC.Difference'), verbose = FALSE)
    data <- RunPCA(data, npcs = 50, verbose = FALSE)
    # t-SNE and Clustering
    data <- RunUMAP(data, reduction = "pca", dims = 1:npcs)
    data <- RunTSNE(data, reduction = "pca", dims = 1:npcs)
    data <- FindNeighbors(data, reduction = "pca", dims = 1:npcs)
    data <- FindClusters(data, resolution = resolution)
    
    return(data)
}

splitstim <- cluster(stim.combined)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6449
## Number of edges: 260790
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8389
## Number of communities: 8
## Elapsed time: 2 seconds
merged <- cluster(merged)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6449
## Number of edges: 256859
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8699
## Number of communities: 8
## Elapsed time: 1 seconds
splitruns <- cluster(runs.combined)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6449
## Number of edges: 322533
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7907
## Number of communities: 9
## Elapsed time: 1 seconds
splitdonor <- cluster(donor.combined)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6449
## Number of edges: 255621
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8290
## Number of communities: 8
## Elapsed time: 1 seconds
# Visualization
vizClust <- function(data, reduction = "umap") {
    p1 <- DimPlot(data, reduction = reduction, split.by = "stim")
    p2 <- DimPlot(data, reduction = reduction, split.by = "donor")
    p3 <- DimPlot(data, reduction = reduction, label = TRUE)
    print(plot_grid(p1, p2, p3, ncol = 1))
}

vizClust(merged)

vizClust(splitdonor)

vizClust(splitstim)

vizClust(splitruns)

markers <- function(data) {
    DefaultAssay(data) <- "RNA"
    Idents(data) <- "stim"
    avg.exp <- log1p(AverageExpression(data, verbose = FALSE)$RNA)
    avg.exp$gene <- rownames(avg.exp)
    p1 <- ggplot(avg.exp, aes(M, G)) + geom_point() + ggtitle("Mph")
    # print(p1)
    roc.markers <- FindMarkers(data, ident.1 = "G", ident.2 = "M", test.use = "roc", verbose = FALSE)
    wilcox.markers <- FindMarkers(data, ident.1 = "G", ident.2 = "M", test.use = "wilcox", verbose = FALSE)
    
    g.response = merge(roc.markers, wilcox.markers, by = 0, all = TRUE)
    print(head(g.response, n = 15))
    
    return(g.response)
}

DefaultAssay(splitruns) <- "RNA"
splitruns.markers <- markers(splitruns)
##    Row.names myAUC   avg_diff power pct.1.x pct.2.x         p_val
## 1      ABCG1 0.616  0.2778750 0.232   0.279   0.049 6.148143e-150
## 2       ACO1 0.631  0.2807387 0.262   0.832   0.687  5.600065e-58
## 3      ACOT7 0.629  0.3210459 0.258   0.716   0.521  5.010557e-60
## 4     ADAM10 0.378 -0.3282016 0.244   0.806   0.862  1.065538e-49
## 5     ADAM15 0.639  0.2531354 0.278   0.599   0.333  2.272680e-80
## 6   ADAMTSL4 0.672  0.2853401 0.344   0.403   0.063 6.870230e-246
## 7      ADAP2 0.387 -0.3132631 0.226   0.558   0.633  1.657069e-45
## 8    AFAP1L1 0.356 -0.4856171 0.288   0.361   0.581  4.321714e-76
## 9     AGPAT9 0.337 -0.3963841 0.326   0.760   0.849  6.457326e-87
## 10    AKR1B1 0.701  0.5226678 0.402   0.778   0.497 4.659978e-142
## 11     ALAS1 0.685  0.4707378 0.370   0.941   0.855 6.903546e-112
## 12     ALCAM 0.336 -0.3640073 0.328   0.923   0.972  4.744433e-88
## 13   ALDH1A2 0.708  0.8786520 0.416   0.621   0.258 1.215774e-192
## 14     ALDH2 0.658  0.4241406 0.316   0.826   0.662  7.648748e-84
## 15   ALOX5AP 0.808  0.8427211 0.616   0.945   0.733 1.198178e-307
##     avg_logFC pct.1.y pct.2.y     p_val_adj
## 1   0.2778750   0.279   0.049 1.202392e-145
## 2   0.2807387   0.832   0.687  1.095205e-53
## 3   0.3210459   0.716   0.521  9.799147e-56
## 4  -0.3282016   0.806   0.862  2.083872e-45
## 5   0.2531354   0.599   0.333  4.444680e-76
## 6   0.2853401   0.403   0.063 1.343611e-241
## 7  -0.3132631   0.558   0.633  3.240730e-41
## 8  -0.4856171   0.361   0.581  8.451975e-72
## 9  -0.3963841   0.760   0.849  1.262859e-82
## 10  0.5226678   0.778   0.497 9.113519e-138
## 11  0.4707378   0.941   0.855 1.350127e-107
## 12 -0.3640073   0.923   0.972  9.278687e-84
## 13  0.8786520   0.621   0.258 2.377690e-188
## 14  0.4241406   0.826   0.662  1.495866e-79
## 15  0.8427211   0.945   0.733 2.343277e-303
markers <- splitruns.markers[splitruns.markers$myAUC > 0.7, ]
markers <- c(markers$Row.names)
print(markers)
##  [1] "AKR1B1"   "ALDH1A2"  "ALOX5AP"  "B2M"      "C19orf59" "CD44"    
##  [7] "CD74"     "CLEC7A"   "CYBB"     "CYP1B1"   "FAM129A"  "FKBP5"   
## [13] "FTH1"     "GLRX"     "HLA-DPA1" "HLA-DRA"  "HLA-DRB1" "LYZ"     
## [19] "METTL9"   "MOB3B"    "MRC1L1"   "PRDX1"    "PTAFR"    "SDCBP"   
## [25] "SLC7A11"  "SORT1"    "STAC"     "TMSB10"   "TPT1"     "TXN"     
## [31] "VDR"
plot.stim <- VlnPlot(splitruns, features = markers, group.by = "stim", pt.size = 0.01, combine = TRUE)
print(plot.stim)

plot.donor <- VlnPlot(splitruns, features = markers, split.by = "stim", group.by = "donor", pt.size = 0, combine = TRUE)
print(plot.donor)

plot.clust <- VlnPlot(splitruns, features = markers, split.by = "stim", pt.size = 0, combine = TRUE)
print(plot.clust)

plot.qc <- VlnPlot(splitruns, features = c("nCount_RNA", "percent.mt"), pt.size = 0, combine = TRUE)
print(plot.qc)

# CombinePlots(plots = plots, ncol = 1)
FeaturePlot(splitruns, features = markers, split.by = "stim", pt.size = 0.15, cols = c("grey", "red"))